library(tidyverse) # for data cleaning and plotting
library(googlesheets4) # for reading googlesheet data
library(lubridate) # for date manipulation
library(openintro) # for the abbr2state() function
library(palmerpenguins)# for Palmer penguin data
library(maps) # for map data
library(ggmap) # for mapping points on maps
library(gplots) # for col2hex() function
library(RColorBrewer) # for color palettes
library(sf) # for working with spatial data
library(leaflet) # for highly customizable mapping
library(carData) # for Minneapolis police stops data
library(ggthemes) # for more themes (including theme_map())
# Session-wide setup for the rest of the document.
gs4_deauth() # To not have to authorize each time you knit.
# Make theme_minimal() the default ggplot2 theme for every plot drawn below.
theme_set(theme_minimal())
# Starbucks store locations, read straight from the course website.
Starbucks <- read_csv("https://www.macalester.edu/~ajohns24/Data/Starbucks.csv")

# Number of US stores per state, with the lower-cased full state name
# (abbr2state() expands the two-letter code) added as `state_name`.
starbucks_us_by_state <- Starbucks %>%
  filter(Country %in% "US") %>%
  count(`State/Province`) %>%
  mutate(
    state_name = str_to_lower(abbr2state(`State/Province`))
  )
# Lisa's favorite St. Paul places - example for you to create your own data.
# Written row-wise so each place sits next to its own coordinates.
favorite_stp_by_lisa <- tribble(
  ~place,                    ~long,       ~lat,
  "Home",                    -93.1405743, 44.950576,
  "Macalester College",      -93.1712321, 44.9378965,
  "Adams Spanish Immersion", -93.1451796, 44.9237914,
  "Spirit Gymnastics",       -93.1650563, 44.9654609,
  "Bama & Bapa",             -93.1542883, 44.9295072,
  "Now Bikes",               -93.1696608, 44.9436813,
  "Dance Spectrum",          -93.1393172, 44.9399922,
  "Pizza Luce",              -93.1524256, 44.9468848,
  "Brunson's",               -93.0753863, 44.9700727
)
# COVID-19 data from the New York Times (state-level file on GitHub)
covid19 <-
  "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv" %>%
  read_csv()
These exercises will reiterate what you learned in the “Mapping data with R” tutorial. If you haven’t gone through the tutorial yet, you should do that first.
ggmap)Starbucks locations to a world map. Add an aesthetic to the world map that sets the color of the points according to the ownership type. What, if anything, can you deduce from this visualization?
# Low-zoom terrain basemap covering (almost) the whole world.
world <- get_stamenmap(
  bbox = c(left = -180, bottom = -57, right = 179, top = 82.1),
  maptype = "terrain",
  zoom = 2
)

# One small, mostly transparent point per store, colored by ownership type.
world %>%
  ggmap() +
  geom_point(
    data = Starbucks,
    mapping = aes(x = Longitude, y = Latitude, color = `Ownership Type`),
    size = 0.2,
    alpha = 0.2
  ) +
  theme_map() +
  theme(legend.background = element_blank())
# Terrain basemap of the Twin Cities metro area at a street-level zoom.
Twin_Cities <- get_stamenmap(
  bbox = c(left = -93.67, bottom = 44.75, right = -92.59, top = 45.18),
  maptype = "terrain",
  zoom = 12
)

# Starbucks stores on top of the basemap, colored by ownership type.
Twin_Cities %>%
  ggmap() +
  geom_point(
    data = Starbucks,
    mapping = aes(x = Longitude, y = Latitude, color = `Ownership Type`),
    size = 2
  ) +
  theme_map() +
  theme(legend.background = element_blank())
The zoom number controls the level of detail in the map. When the number is large, we see more detail in the plot, and when it is small, we see less.
get_stamenmap() in help and look at maptype). Include a map with one of the other map types.Different maptypes available with get_stamenmap: “terrain”, “terrain-background”, “terrain-labels”, “terrain-lines”, “toner”, “toner-2010”, “toner-2011”, “toner-background”, “toner-hybrid”, “toner-labels”, “toner-lines”, “toner-lite”, “watercolor”
# Same Twin Cities bounding box, this time with the "toner-2011" map type
# and a coarser zoom level.
Twin_Cities <- get_stamenmap(
  bbox = c(left = -93.67, bottom = 44.75, right = -92.59, top = 45.18),
  maptype = "toner-2011",
  zoom = 10
)

# Starbucks stores on top of the basemap, colored by ownership type.
Twin_Cities %>%
  ggmap() +
  geom_point(
    data = Starbucks,
    mapping = aes(x = Longitude, y = Latitude, color = `Ownership Type`),
    size = 2
  ) +
  theme_map() +
  theme(legend.background = element_blank())
annotate() function (see ggplot2 cheatsheet).
# Toner basemap of the Twin Cities metro area.
Twin_Cities <- get_stamenmap(
  bbox = c(left = -93.67, bottom = 44.75, right = -92.59, top = 45.18),
  maptype = "toner-2011",
  zoom = 10
)

# Starbucks stores plus a labelled marker for Macalester College.
# The text is placed slightly south of the point so the two do not overlap.
Twin_Cities %>%
  ggmap() +
  geom_point(
    data = Starbucks,
    mapping = aes(x = Longitude, y = Latitude, color = `Ownership Type`),
    size = 2
  ) +
  theme_map() +
  theme(legend.background = element_blank()) +
  annotate(
    "text",
    x = -93.1712321,
    y = 44.9308890,
    label = "Macalester College"
  ) +
  annotate(
    "point",
    x = -93.1712321,
    y = 44.9378965,
    size = 4,
    color = "Dark green"
  )
geom_map())The example I showed in the tutorial did not account for population of each state in the map. In the code below, a new variable is created, starbucks_per_10000, that gives the number of Starbucks per 10,000 people. It is in the starbucks_with_2018_pop_est dataset.
# 2018 state population estimates. The raw `state` values carry a leading
# dot, so split it off, lower-case the name, and discard the dot column.
census_pop_est_2018 <-
  read_csv("https://www.dropbox.com/s/6txwv3b4ng7pepe/us_census_2018_state_pop_est.csv?dl=1") %>%
  separate(state, into = c("dot", "state"), extra = "merge") %>%
  mutate(state = str_to_lower(state)) %>%
  select(-dot)

# Attach the population estimate to each state's store count and express
# the count per 10,000 residents.
starbucks_with_2018_pop_est <-
  left_join(
    starbucks_us_by_state,
    census_pop_est_2018,
    by = c("state_name" = "state")
  ) %>%
  mutate(starbucks_per_10000 = n / est_pop_2018 * 10000)
dplyr review: Look through the code above and describe what each line of code does. Line 1: Reads in the 2018 state population estimates from the US Census. Line 2: Separates the leading dot from the state name; extra = "merge" keeps the rest of the name together, because otherwise the second word of a two-word state name, such as New York, would be left out. Line 3: Removes the dot column. Line 4: Converts all the state names to lower case. Line 5: Starts from starbucks_us_by_state; the result of the pipeline is saved as starbucks_with_2018_pop_est. Line 6: Left joins the 2018 census population estimates by state name. Line 7: Creates a new variable, starbucks_per_10000, which divides the total number of Starbucks in a state by its estimated 2018 population and then multiplies the result by 10,000.
# State outlines used both as the polygons for geom_map() and to set the
# plot limits.
states_map <- map_data("state")

# Choropleth of Starbucks per 10,000 residents, with every store in the
# contiguous US drawn on top as a small orange point.
starbucks_with_2018_pop_est %>%
  ggplot() +
  geom_map(
    map = states_map,
    aes(map_id = state_name, fill = starbucks_per_10000)
  ) +
  geom_point(
    # Alaska and Hawaii are excluded from the point layer.
    data = Starbucks %>%
      filter(!`State/Province` %in% c("HI", "AK"), `Country` == "US"),
    aes(x = Longitude, y = Latitude),
    size = .03,
    alpha = .3,
    color = "Orange"
  ) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  labs(
    title = "Starbucks in the US",
    caption = "Created by Donia Khraishi"
  ) +
  scale_fill_viridis_c() +
  # theme_map() is a complete theme, so it must come BEFORE the theme()
  # tweak; in the opposite order it would reset legend.background.
  theme_map() +
  theme(legend.background = element_blank())
From the map we can see that the number of Starbucks per 10,000 people is higher in some states than in others. States on the West coast seem to have a lower number of Starbucks per 10,000 people, on average, compared to other parts of the US.
### A few of your favorite things (leaflet)
tibble() function that has 10-15 rows of your favorite places. The columns will be the name of the location, the latitude, the longitude, and a column that indicates if it is in your top 3 favorite locations or not. For an example of how to use tibble(), look at the favorite_stp_by_lisa I created in the data R code chunk at the beginning.
# Ten favorite places; `favorite_place` marks the top three with "Yes".
# Written row-wise so each place sits next to its own coordinates.
favorite_step_by_Donia <- tribble(
  ~place,               ~long,       ~lat,       ~favorite_place,
  "Minnehaha Falls",    -93.2109830, 44.9153316, "Yes",
  "Holy Land",          -93.2470188, 45.0136777, "Yes",
  " Gandhi Mahal",      -93.2329613, 44.9479311, "No",
  "My Apartment",       -93.1565140, 44.9391415, "No",
  "Macalester College", -93.1694827, 44.9412276, "No",
  "guthrie theater",    -93.2555321, 44.9784450, "Yes",
  "lake harriet",       -93.3086275, 44.9275383, "No",
  "Nicollet island",    -93.2644138, 44.9867165, "No",
  "Golbal Market",      -93.2611762, 44.9485243, "No",
  "buca di beppo",      -93.1765283, 44.8974544, "No"
)
leaflet map that uses circles to indicate your favorite places. Label them with the name of the place. Choose the base map you like best. Color your 3 favorite places differently than the ones that are not in your top 3 (HINT: colorFactor()). Add a legend that explains what the colors mean.
# Two-color palette keyed on the "Yes"/"No" values of favorite_place.
pal <- colorFactor(
  palette = c("#111D4A", "#5998C5"),
  domain = favorite_step_by_Donia$favorite_place
)

# Dark basemap with one labelled circle per place, a legend for the
# top-3 coloring, and a line joining the places in data order.
leaflet(data = favorite_step_by_Donia) %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>%
  addCircles(
    lng = ~long,
    lat = ~lat,
    label = ~place,
    weight = 9,
    # Opacity is a proportion in [0, 1]; the previous value of 2 was out of range.
    opacity = 1,
    color = ~pal(favorite_place)
  ) %>%
  addLegend(
    pal = pal,
    values = ~favorite_place,
    opacity = 1,
    title = "Favorite Place",
    position = "bottomright"
  ) %>%
  addPolylines(
    lng = ~long,
    lat = ~lat,
    color = "#829399"
  )
Connect all your locations together with a line in a meaningful way (you may need to order them differently in the original data).
If there are other variables you want to add that could enhance your plot, do that now.
This section will revisit some datasets we have used previously and bring in a mapping component.
The data come from Washington, DC and cover the last quarter of 2014.
Two data tables are available:
Trips contains records of individual rentals. Stations gives the locations of the bike rental stations. Here is the code to read in the data. We do this a little differently than usual, which is why it is included here rather than at the top of this file. To avoid repeatedly re-reading the files, start the data import chunk with {r cache = TRUE} rather than the usual {r}. This code reads in the large dataset right away.
# Location of the Q4 2014 trip history file (.rds).
data_site <-
  "https://www.macalester.edu/~dshuman1/data/112/2014-Q4-Trips-History-Data.rds"

# readRDS() cannot take a URL directly, so open a connection to it and
# wrap that in gzcon() before reading.
Trips <- data_site %>%
  url() %>%
  gzcon() %>%
  readRDS()

# Station table, read as an ordinary CSV.
Stations <- read_csv("http://www.macalester.edu/~dshuman1/data/112/DC-Stations.csv")
Stations to make a visualization of the total number of departures from each station in the Trips data. Use either color or size to show the variation in number of departures. This time, plot the points on top of a map. Use any of the mapping tools you’d like.
# Total departures per station location.
# Trips are counted per start station BEFORE joining: counting rows after a
# left join would report 1 departure for a station that has no trips at all,
# because the unmatched station still contributes one (all-NA) row.
Stations2 <- Stations %>%
  left_join(
    count(Trips, sstation, name = "n_departures"),
    by = c("name" = "sstation")
  ) %>%
  group_by(long, lat) %>%
  # na.rm = TRUE turns "no matching trips" into 0 departures.
  summarize(total_departures = sum(n_departures, na.rm = TRUE)) %>%
  # Drop any leftover grouping so later steps see a plain tibble.
  ungroup()
# Toner basemap covering the bike-share stations in Stations2.
Washington_DC <- get_stamenmap(
  bbox = c(left = -77.1732, bottom = 38.8159, right = -76.7990, top = 39.0058),
  maptype = "toner-2011",
  zoom = 11
)

# One point per station, colored by its total number of departures.
ggmap(Washington_DC) +
  geom_point(
    data = Stations2,
    aes(x = long, y = lat, color = total_departures),
    size = 2
  ) +
  theme_map() +
  # The `+` after theme() was missing, so the viridis scale was printed as a
  # stand-alone object instead of being applied to the plot.
  theme(legend.background = element_blank()) +
  scale_color_viridis_c()
# Share of departures made by casual riders at each station location:
# the mean of a logical is the proportion of TRUE values.
Stations2 <-
  left_join(Stations, Trips, by = c("name" = "sstation")) %>%
  group_by(long, lat) %>%
  summarize(percent_casual = mean(client == "Casual"))

# Toner basemap covering the bike-share stations in Stations2.
Washington_DC <- get_stamenmap(
  bbox = c(left = -77.1732, bottom = 38.8159, right = -76.7990, top = 39.0058),
  maptype = "toner-2011",
  zoom = 11
)

# One point per station, colored by its casual-rider share.
Washington_DC %>%
  ggmap() +
  geom_point(
    data = Stations2,
    mapping = aes(x = long, y = lat, color = percent_casual),
    size = 2
  ) +
  theme_map() +
  scale_color_viridis_c()
There is a high percentage of casual riders renting bikes from stations in the downtown area, especially along the river. This makes sense, since people who do not live downtown are likely to be the casual riders who rent bikes when visiting. Visitors usually like to see the downtown and the river area.
### COVID-19 data
The following exercises will use the COVID-19 data from the NYT.
# State outlines used both as the polygons for geom_map() and to set the
# plot limits.
states_map <- map_data("state")

# Most recent cumulative case count per state. `cases` is treated as a
# running total here, so its maximum is taken as the latest value.
covid19 %>%
  group_by(state) %>%
  summarize(cumulative_cases = max(cases)) %>%
  # map_data("state") uses lower-case state names as region ids.
  mutate(state = str_to_lower(state)) %>%
  ggplot(aes(fill = cumulative_cases)) +
  geom_map(
    map = states_map,
    aes(map_id = state)
  ) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  labs(title = "Cumulative COVID-19 cases in the US") +
  scale_fill_viridis_c() +
  # theme_map() is a complete theme, so it must come BEFORE the theme()
  # tweak; in the opposite order it would reset legend.background.
  theme_map() +
  theme(legend.background = element_blank())
The map shows that CA and TX have the highest cumulative case counts in the country; however, they also have the highest populations in the US. So the map needs to show the COVID-19 cases in each state in proportion to its population. What is shown now is not a fair comparison: some states have large populations, so it is logical that they would have more cases, while other states, like the Dakotas, have small populations.
# Latest cumulative case count per state, joined to the 2018 population
# estimate and expressed per 10,000 residents.
covid19_population <-
  covid19 %>%
  # Lower-case once here: it matches both the census table and map_data().
  mutate(state = str_to_lower(state)) %>%
  left_join(census_pop_est_2018,
            by = "state") %>%
  group_by(state, est_pop_2018) %>%
  summarize(cumulative_cases = max(cases)) %>%
  # Drop the leftover grouping so later steps see a plain tibble.
  ungroup() %>%
  mutate(cases_per_10000 = (cumulative_cases / est_pop_2018) * 10000)

# State outlines used both as the polygons for geom_map() and to set the
# plot limits.
states_map <- map_data("state")

# `state` is already lower case, so no second str_to_lower() is needed.
covid19_population %>%
  ggplot() +
  geom_map(
    map = states_map,
    aes(map_id = state, fill = cases_per_10000)
  ) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  labs(title = "Cumulative COVID-19 cases per 10,000 people in the US") +
  scale_fill_viridis_c() +
  # theme_map() is a complete theme, so it must come BEFORE the theme()
  # tweak; in the opposite order it would reset legend.background.
  theme_map() +
  theme(legend.background = element_blank())
# Snapshot of four dates, one month apart, with cases per 10,000 residents.
covid19_date <-
  covid19 %>%
  filter(
    date %in% ymd(c("2020-04-20", "2020-05-20", "2020-06-20", "2020-07-20"))
  ) %>%
  mutate(state = str_to_lower(state)) %>%
  left_join(census_pop_est_2018, by = "state") %>%
  mutate(cases_per_10000 = cases / est_pop_2018 * 10000)

# One choropleth panel per snapshot date.
covid19_date %>%
  mutate(state = str_to_lower(state)) %>%
  ggplot() +
  geom_map(
    mapping = aes(map_id = state, fill = cases_per_10000),
    map = states_map
  ) +
  expand_limits(x = states_map$long, y = states_map$lat) +
  facet_wrap(~date) +
  labs(title = "Cumulative COVID-19 cases per 10,000 people in the US") +
  theme_map() +
  theme(legend.background = element_blank()) +
  scale_fill_viridis_c()
I chose dates from April, May, June, and July, and we can see that the number of cases per 10,000 people has increased over time in almost all of the states.
These exercises use the datasets MplsStops and MplsDemo from the carData library. Search for them in Help to find out more information.
MplsStops dataset to find out how many stops there were for each neighborhood and the proportion of stops that were for a suspicious vehicle or person. Sort the results from most to least number of stops. Save this as a dataset called mpls_suspicious and display the table.
# One row per neighborhood:
#   n                     - total number of stops in the neighborhood
#   proportion_suspicious - share of those stops with problem == "suspicious"
# Summarizing directly (rather than count() followed by filter()) keeps `n`
# as the TOTAL number of stops and keeps neighborhoods that had no
# suspicious stops (proportion 0) in the table.
mpls_suspicious <- MplsStops %>%
  group_by(neighborhood) %>%
  summarize(
    n = n(),
    proportion_suspicious = mean(problem == "suspicious")
  ) %>%
  # Sort from most to fewest stops; the saved dataset keeps this order.
  arrange(desc(n))

# Display the table.
mpls_suspicious
leaflet map and the MplsStops dataset to display each of the stops on a map as a small point. Color the points differently depending on whether they were for suspicious vehicle/person or a traffic stop (the problem variable). HINTS: use addCircleMarkers, set stroke = FAlSE, use colorFactor() to create a palette.
# Viridis palette keyed on the levels of `problem`.
pal <- colorFactor(
  palette = "viridis",
  domain = MplsStops$problem
)

# Every stop drawn as a borderless circle colored by `problem`.
# preferCanvas plus the tile-update options are set to keep the map
# responsive with this many points.
leaflet(
  data = MplsStops,
  options = leafletOptions(preferCanvas = TRUE)
) %>%
  addProviderTiles(
    providers$Stamen.TonerHybrid,
    options = providerTileOptions(
      updateWhenZooming = FALSE,
      updateWhenIdle = TRUE
    )
  ) %>%
  addCircles(
    lng = ~long,
    lat = ~lat,
    label = ~problem,
    color = ~pal(problem),
    stroke = FALSE,
    weight = 10,
    opacity = 1
  ) %>%
  addLegend(
    position = "bottomright",
    title = "Problem",
    pal = pal,
    values = ~problem,
    opacity = 1
  )
eval=FALSE. Although it looks like it only links to the .shp file, you need the entire folder of files to create the mpls_nbhd data set. These data contain information about the geometries of the Minneapolis neighborhoods. Using the mpls_nbhd dataset as the base file, join the mpls_suspicious and MplsDemo datasets to it by neighborhood (careful, they are named different things in the different files). Call this new dataset mpls_all. Ask Rayyan
# Read the Minneapolis neighborhood shapefile from the local
# Minneapolis_Neighborhoods/ folder (quiet = TRUE suppresses the read log).
mpls_nbhd <- st_read("Minneapolis_Neighborhoods/Minneapolis_Neighborhoods.shp", quiet = TRUE)
# Build mpls_all: one geometry per neighborhood name (BDNAME), with the stop
# summary (mpls_suspicious) and the demographics (MplsDemo) joined on.
mpls_all<- mpls_nbhd %>%
# NOTE(review): mpls_nbhd comes from st_read(), so it is presumably already an
# sf object and this conversion (with its long/lat coords) may be a no-op - confirm.
st_as_sf(coords = c("long", "lat"), crs = "NAD27") %>%
# Collapse to one row per neighborhood, combining its geometries.
group_by(BDNAME) %>%
summarise(geometry = st_combine(geometry)) %>%
st_cast("POLYGON") %>%
# The neighborhood name is `BDNAME` here but `neighborhood` in the other two tables.
left_join(mpls_suspicious,
by = c("BDNAME" = "neighborhood")) %>%
left_join(MplsDemo,
by = c("BDNAME" = "neighborhood"))
leaflet to create a map from the mpls_all data that colors the neighborhoods by prop_suspicious. Display the neighborhood name as you scroll over it. Describe what you observe in the map.
# Sequential "Blues" palette over the observed range of proportion_suspicious.
palette_mpls_all <- colorNumeric("Blues", domain = mpls_all$proportion_suspicious)

# Neighborhood polygons filled by the proportion of suspicious stops, with
# the neighborhood name shown on hover.
leaflet(data = mpls_all) %>%
  addProviderTiles(providers$Esri.WorldStreetMap) %>%
  addPolygons(
    label = ~BDNAME,
    fillColor = ~palette_mpls_all(proportion_suspicious),
    stroke = FALSE,
    fillOpacity = .7,
    # Spell the argument out in full (no partial matching) and use a valid
    # color name: the previous "block" was a typo for "black".
    # NOTE(review): the base style sets stroke = FALSE, so the highlight color
    # likely only shows if the highlight also enables the stroke - confirm.
    highlightOptions = highlightOptions(
      color = "black",
      fillOpacity = .8,
      bringToFront = FALSE
    )
  ) %>%
  addLegend(
    pal = palette_mpls_all,
    values = ~proportion_suspicious,
    opacity = 0.4,
    title = NULL,
    position = "bottomright"
  )
From the map we can see that the proportion of being suspicious is the highest on average across neighborhood on the bottom right part of Minneapolis. It also looks like this proportion is lower in the upper right part of the city and the bottom left area.
leaflet to create a map of your own choosing. Come up with a question you want to try to answer and use the map to help answer that question. Describe what your map shows.
# Viridis palette keyed on the levels of `preRace`.
pal <- colorFactor(
  palette = "viridis",
  domain = MplsStops$preRace
)

# Every stop drawn as a borderless circle colored by `preRace`.
# preferCanvas plus the tile-update options are set to keep the map
# responsive with this many points.
leaflet(
  data = MplsStops,
  options = leafletOptions(preferCanvas = TRUE)
) %>%
  addProviderTiles(
    providers$Stamen.TonerHybrid,
    options = providerTileOptions(
      updateWhenZooming = FALSE,
      updateWhenIdle = TRUE
    )
  ) %>%
  addCircles(
    lng = ~long,
    lat = ~lat,
    label = ~preRace,
    color = ~pal(preRace),
    stroke = FALSE,
    weight = 10,
    opacity = 1
  ) %>%
  addLegend(
    position = "bottomright",
    title = "Officer's assessment of race of the person stopped before speaking with the person stopped",
    pal = pal,
    values = ~preRace,
    opacity = 1
  )
DID YOU REMEMBER TO UNCOMMENT THE OPTIONS AT THE TOP?